
###########################################################################################
####### NCCPPR Analysis for "Legislative Effectiveness in the American States"
##########################################################################################
### *** Includes code to replicate Figure 1 and Table A4
### *** For all other replication materials, see:
### *** For Figures 2-7 and Tables 1-3, see separate `SLES_analysis_main.R` script 
### *** For all Appendix materials except Table A4, see separate `SLES_analysis_appendix.R` script 
###########################################################################################
### Originally run using R version 4.0.2 (2020-06-22)

# rm(list=ls())
options(stringsAsFactors = FALSE, scipen = 999)

### **********************************************************************
### Set Working Directory to Folder Containing Replication Files and Data
### **********************************************************************
working_directory <- ""

setwd(working_directory)

### Install/Load Packages
# install.packages(c("tidyverse", "estimatr", "haven", "huxtable"))
library(tidyverse) # version 1.3.0
library(estimatr)  # version 0.22.0
library(haven)     # version 2.3.1
library(huxtable)  # version 5.0.0

### Create Figures Directory in Working Directory if Not Present
if(!dir.exists("Figures")){
  dir.create("Figures")
}

### Create Tables Directory in Working Directory if Not Present
if(!dir.exists("Tables")){
  dir.create("Tables")
}

#######################################################
## Load Cleaned/Aggregated SLES Data, Subset to NC
####################################################

### North Carolina SLES Data, 2005 - 2012 (Saved via analysis_main script)
nc_SLES <- read_csv("Data/SLES_individual_level.csv", col_types = cols(district = 'c')) %>%
  filter(state == "NC" & term_start %in% 2005:2012) %>%
  mutate(chamber = ifelse(chamber == "lower", "House", "Senate"))
  
glimpse(nc_SLES)

#######################################################
## Load NCCPPR Data via Edwards (2018) and Clean
####################################################
# Data from: Edwards (2018) "Formal Authority, Persuasive Power, and Effectiveness in State Legislatures" State Politics & Policy Quarterly

# Trim/Clean Data
nc_rankings <- haven::read_dta("Data/Edwards_2018_NC_legislators.dta")
nc_rankings <- nc_rankings %>%
  sjlabelled::remove_all_labels() %>% # Zap Stata Labels and Attributes
  haven::zap_formats() %>%
  mutate(state = "NC", 
         term = dplyr::recode(thisTerm, `1` = "2005_2006", `2` = "2007_2008", `3` = "2009_2010", `4` = "2011_2012"),
         chamber = ifelse(isRepresentative == 1, "House", "Senate"),
  ) %>%
  rename(party = myParty,
         name = thissponsor, 
         edwards_les = les_measure) %>%
  select(state, term, chamber, name, party, lawsPassed, matthewsHitRate, bayeshitrate, nccpprRanking, edwards_les) 

### Match Legislators Across Sources (NCCPPR Rankings and SLES)
### Note: The loop print's the name of any legislator matched via string distance (and their match) to verify match is correct
nc_SLES <- mutate(nc_SLES, match_name = NA) %>% as.data.frame()
for(i in 1:nrow(nc_SLES)){
  
  this_sub <- filter(nc_rankings, chamber == nc_SLES$chamber[i] & term == nc_SLES$term[i])
  
  ### Exact Last Name Match
  last_name_vec <- gsub('.+ ', '', gsub(', ii+$| ii+$|, jr\\.$| jr\\.$|, sr\\.$| sr\\.$', '', tolower(this_sub$name)))
  exact_matches <- which(gsub(',.+', '', tolower(nc_SLES$sles_sponsor[i])) == last_name_vec)
  if(length(exact_matches) == 1){
    nc_SLES$match_name[i] <- this_sub[exact_matches,]$name
    next
  }
  
  ### Exact Data Name Match
  exact_matches2 <- which(tolower(nc_SLES$raw_data_name[i]) == tolower(this_sub$name))
  if(length(exact_matches2) == 1){
    nc_SLES$match_name[i] <- this_sub[exact_matches2,]$name
    next
  }
  
  #### Match Based on String Distance (Max = 5)
  name_adj <- paste0(gsub('^[^,]+, ', '', nc_SLES$sles_sponsor[i]), ' ', gsub(',.+', '', nc_SLES$sles_sponsor[i]))
  match <- stringdist::amatch(name_adj, tolower(this_sub$name), maxDist = 5) # Max Typos
  if(!is.na(match)){
    nc_SLES$match_name[i] <- this_sub[match,]$name
    print(paste0(nc_SLES$sles_sponsor[i] , " ------- ", this_sub[match,]$name))
  }
  #print(i)
}
rm(this_sub, i, last_name_vec, exact_matches, exact_matches2, name_adj, match)

##### Examine Matches
# nc_SLES %>% select(chamber, match_name, sles_sponsor) %>% distinct() %>% View()

##### Fix Errors 
nc_SLES[nc_SLES$sles_sponsor == "jones, earl",]$match_name <- "Jones"

#### Merge NCCPPR Rankings to SLES Dataframe
nc_SLES <- left_join(nc_SLES, nc_rankings %>% select(-c(party, lawsPassed, edwards_les)), by = c("state" = "state", "chamber" = "chamber", "term" = "term", "match_name" = "name"))

#### Check/Fill Missing NCCPPR RANKINGS from Edwards Data
## -- NCCPPR Source Data Available By Year: https://nccppr.org/category/legislative-rankings/
## -- Note that most missing individuals (the commented rows) simply do not have NCCPPR scores 
## -- However, the Edwards data appears to have excluded some legislators ranked #1 overall, so adding those back in

#nc_SLES[nc_SLES$term == "2005_2006" & nc_SLES$sles_sponsor == "hall, john d.",]$nccpprRanking <- NA
#nc_SLES[nc_SLES$term == "2005_2006" & nc_SLES$sles_sponsor == "spear, timothy l.",]$nccpprRanking <- NA
nc_SLES[nc_SLES$term == "2005_2006" & nc_SLES$sles_sponsor == "basnight, marc",]$nccpprRanking <- 1
# nc_SLES[nc_SLES$term == "2005_2006" & nc_SLES$sles_sponsor == "bland, c. w.",]$nccpprRanking <- NA # Replaced Scott Thomas (who resigned but has score)
# nc_SLES[nc_SLES$term == "2005_2006" & nc_SLES$sles_sponsor == "horton, hamilton c. jr.",]$nccpprRanking <- NA # Died 1/31/2006
# nc_SLES[nc_SLES$term == "2005_2006" & nc_SLES$sles_sponsor == "miller, william b.",]$nccpprRanking <- NA # Replaced Horton
# nc_SLES[nc_SLES$term == "2007_2008" & nc_SLES$sles_sponsor == "almond, david",]$nccpprRanking <- NA
# nc_SLES[nc_SLES$term == "2007_2008" & nc_SLES$sles_sponsor == "black, jim b.",]$nccpprRanking <- NA 
# nc_SLES[nc_SLES$term == "2007_2008" & nc_SLES$sles_sponsor == "furr, kevin",]$nccpprRanking <- NA # Not included, appointed 8/15/07
nc_SLES[nc_SLES$term == "2007_2008" & nc_SLES$sles_sponsor == "hackney, joe",]$nccpprRanking <- 1
# nc_SLES[nc_SLES$term == "2007_2008" & nc_SLES$sles_sponsor == "hughes, sandra spaulding",]$nccpprRanking <- NA
# nc_SLES[nc_SLES$term == "2007_2008" & nc_SLES$sles_sponsor == "lucas, jeanne h.",]$nccpprRanking <- NA
# nc_SLES[nc_SLES$term == "2009_2010" & nc_SLES$sles_sponsor == "blue, daniel jr.",]$nccpprRanking <- NA
nc_SLES[nc_SLES$term == "2009_2010" & nc_SLES$sles_sponsor == "hackney, joe",]$nccpprRanking <- 1
# nc_SLES[nc_SLES$term == "2009_2010" & nc_SLES$sles_sponsor == "heagarty, chris",]$nccpprRanking <- NA
# nc_SLES[nc_SLES$term == "2009_2010" & nc_SLES$sles_sponsor == "iler, frank",]$nccpprRanking <- NA
# nc_SLES[nc_SLES$term == "2009_2010" & nc_SLES$sles_sponsor == "ingle, dan w.",]$nccpprRanking <- NA
# nc_SLES[nc_SLES$term == "2009_2010" & nc_SLES$sles_sponsor == "parfitt, diane",]$nccpprRanking <- NA
nc_SLES[nc_SLES$term == "2009_2010" & nc_SLES$sles_sponsor == "basnight, marc",]$nccpprRanking <- 1
# nc_SLES[nc_SLES$term == "2009_2010" & nc_SLES$sles_sponsor == "dickson, margaret highsmith",]$nccpprRanking <- NA
# nc_SLES[nc_SLES$term == "2009_2010" & nc_SLES$sles_sponsor == "malone, vernon",]$nccpprRanking <- NA
# nc_SLES[nc_SLES$term == "2009_2010" & nc_SLES$sles_sponsor == "walters, michael",]$nccpprRanking <- NA
# nc_SLES[nc_SLES$term == "2011_2012" & nc_SLES$sles_sponsor == "gibson, pryor",]$nccpprRanking <- NA
# nc_SLES[nc_SLES$term == "2011_2012" & nc_SLES$sles_sponsor == "carney, becky",]$nccpprRanking <- NA
# nc_SLES[nc_SLES$term == "2011_2012" & nc_SLES$sles_sponsor == "forrester, james s.",]$nccpprRanking <- NA
# nc_SLES[nc_SLES$term == "2011_2012" & nc_SLES$sles_sponsor == "westmoreland, wes",]$nccpprRanking <- NA

rm(nc_rankings)

###########################
###### Base Level Correlation
#############################

nc_SLES %>%
  filter(!is.na(nccpprRanking)) %>%
  group_by(state, term, chamber) %>%
  arrange(state, term, chamber, nccpprRanking) %>%
  mutate(nccpprInverseRank = n():1) %>%
  ungroup() %>%
  summarize(
    ### Using inverse for SLES since higher ranks = less effective 
    base_correlation = cor(SLES, nccpprInverseRank),
    rank_correlation = cor(SLES_rank, nccpprRanking))


##########################
### Create Ranked Measures and Re-Orient Data From Wide to Long Format
#######################

nc_gather <- nc_SLES %>%
  filter(!is.na(nccpprRanking)) %>%
  select(state, term, chamber, sles_sponsor, sles_id, match_name, party, in_majority, 
         seniority, Leader, comm_chair, female, pred.race, appt_or_won_special, np_score, ideo_med_distance,
         starts_with("SLES"), sponsor_pass_rate, sponsor_law_rate, 
         matthewsHitRate, bayeshitrate, 
         nccpprRanking
         ) %>%
  group_by(state, term, chamber) %>%
  arrange(state, term, chamber, nccpprRanking) %>%
  mutate(nccpprInverseRank = n():1,
         nccpprRankPercentile = nccpprInverseRank / n(),
         SLES_InverseRank = -SLES_rank + n() + 1,
         bayes_rank = rank(bayeshitrate)
  ) %>%
  gather(measure_name, measure_value, c(SLES, SLES_InverseRank, sponsor_pass_rate, sponsor_law_rate, matthewsHitRate, bayeshitrate, bayes_rank)) %>%
  arrange(term, chamber, sles_sponsor, measure_name)  %>%
  ungroup()


################################################################
## Figure 1: Criterion Validation in North Carolina
#################################################################

fig_one_df <- nc_gather %>%
  filter(measure_name %in% c("SLES", "SLES_InverseRank", "sponsor_law_rate")) %>% 
  mutate(measure_name = recode(measure_name, 
                               "SLES_InverseRank" = "SLES Rank", 
                               "sponsor_law_rate" = "Hit Rate"),
         Chamber = Hmisc::capitalize(chamber)) %>%
  mutate(chamber_measure = paste0(Chamber, ": ", measure_name))

figure_one <- fig_one_df %>%
  ggplot(aes(x = measure_value, y = nccpprInverseRank)) + 
  geom_point(size = .75, alpha = .5) + 
  geom_smooth(method = "lm", formula = 'y ~ x') +
  facet_wrap(~chamber_measure, scales = "free") + 
  xlab("Effectiveness Measure Value") + 
  ylab("Inverse NCCPPR Ranking") +
  theme(axis.title = element_text(size=12),
        strip.text.x = element_text(size = 12, face = 'bold'))

figure_one

### Save!
ggsave("Figures/Figure1.pdf", figure_one, height = 5, width = 9)


### Specifications to Replicate Results for Lines of Best Fit in Figure 1: 
summary(lm(nccpprInverseRank ~ measure_value, data = fig_one_df %>% filter(chamber_measure == "House: Hit Rate")))
summary(lm(nccpprInverseRank ~ measure_value, data = fig_one_df %>% filter(chamber_measure == "House: SLES")))
summary(lm(nccpprInverseRank ~ measure_value, data = fig_one_df %>% filter(chamber_measure == "House: SLES Rank")))
summary(lm(nccpprInverseRank ~ measure_value, data = fig_one_df %>% filter(chamber_measure == "Senate: Hit Rate")))
summary(lm(nccpprInverseRank ~ measure_value, data = fig_one_df %>% filter(chamber_measure == "Senate: SLES")))
summary(lm(nccpprInverseRank ~ measure_value, data = fig_one_df %>% filter(chamber_measure == "Senate: SLES Rank")))


rm(figure_one, fig_one_df)


#########################################################################################################
##### Table A4: Evaluating the Explanatory Power of Effectiveness Measures in North Carolina
#########################################################################################################
# -- Note: For RMSE, Estimatr doesn't make it easy to get the residuals via pipe
# ---> Backing out RSS instead... R^2 = 1 - (RSS/TSS) --> RSS = (1 - R^2) * TSS
# ---> RMSE = sqrt(RSS / N)

### Fit 2 Models Per Measure, 1 with Chamber+Term FEs, 1 with Chamber+Term FEs and Covariates Included
# -- The base model includes the effectiveness measure of interest, interacted with an indicator for chamber to account for differing chamber sizes, and term fixed effects. 
# -- In the covariate models, we also add variables related to majority party status, seniority, being in the party leadership or a committee chair, sex, race, ideology, and an indicator for having won a special election or been appointed. 

compare_mods <- nc_gather %>%
  nest(data = -c(measure_name)) %>%
  mutate( # *** Note: Fitting as Eff. Measure * Chamber to Account for differing chamber sizes
    base_fit = map(data, ~ lm_robust(nccpprRanking ~ measure_value*factor(chamber) + factor(term), data = ., se_type = "stata", clusters = sles_id)),
    #base_tidy = map(base_fit, tidy),
    base_r2 = map(base_fit, function(x) x[["r.squared"]]),
    base_rmse = map2(base_fit, base_r2, function(f, r2)  sqrt(((1 - r2) * f[['tss']])/f[['N']])),
    covs_fit = map(data, ~ lm_robust(nccpprRanking ~ measure_value*factor(chamber) + factor(term) + in_majority + seniority + Leader + comm_chair + female + I(pred.race == 'black') + I(pred.race == "hispanic") + ideo_med_distance + appt_or_won_special, data = ., se_type = "stata", clusters = sles_id)),
    #covs_tidy = map(covs_fit, tidy),
    covs_r2 = map(covs_fit, function(x) x[["r.squared"]]),
    covs_rmse = map2(covs_fit, covs_r2, function(f, r2)  sqrt(((1 - r2) * f[['tss']])/f[['N']])),
  ) %>%
  unnest(c(base_r2, base_rmse, covs_r2, covs_rmse)) 

### Create/Format Table
table_a4 <- compare_mods %>%
  select(-c(data, base_fit, covs_fit)) %>%
  mutate(measure_name = recode(measure_name, 
                               "bayeshitrate" = "Bayesian Hit Rate (Edwards 2018)", 
                               "bayes_rank" = "Bayesian Hit Rate Rank",
                               "matthewsHitRate" = "Hit Rate (Edwards 2018)", 
                               "SLES_InverseRank" = "SLES Rank", 
                               "sponsor_law_rate" = "Hit Rate (SLES Data)",
                               "sponsor_pass_rate" = "Passage Rate (SLES Data)")) %>%
  slice(4, 5, 3, 2, 1, 6, 7) %>%
  set_names("Effectiveness Measure", " R2 ", " RMSE ", "R2", "RMSE") %>% 
  as_hux() %>%
  insert_row("", "Base Model", "", "Covariate Model", "", after = 0) %>% 
  merge_cells(1, 2:3) %>%
  merge_cells(1, 4:5) %>%
  set_align(1, everywhere, "center") 

table_a4

### Export Table
huxtable::quick_docx(table_a4, file = "Tables/TableA4.docx", open = FALSE)

rm(table_a4, compare_mods)

